1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.lucene.uninverting;
19
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collection;
24 import java.util.Collections;
25 import java.util.List;
26
27 import org.apache.lucene.codecs.PostingsFormat; // javadocs
28 import org.apache.lucene.index.DocValues;
29 import org.apache.lucene.index.DocValuesType;
30 import org.apache.lucene.index.PostingsEnum;
31 import org.apache.lucene.index.FieldInfo;
32 import org.apache.lucene.index.LeafReader;
33 import org.apache.lucene.index.SortedSetDocValues;
34 import org.apache.lucene.index.Terms;
35 import org.apache.lucene.index.TermsEnum;
36 import org.apache.lucene.search.DocIdSetIterator;
37 import org.apache.lucene.util.Accountable;
38 import org.apache.lucene.util.Bits;
39 import org.apache.lucene.util.BytesRef;
40 import org.apache.lucene.util.PagedBytes;
41 import org.apache.lucene.util.StringHelper;
42
43 /**
44 * This class enables fast access to multiple term ords for
45 * a specified field across all docIDs.
46 *
47 * Like FieldCache, it uninverts the index and holds a
48 * packed data structure in RAM to enable fast access.
49 * Unlike FieldCache, it can handle multi-valued fields,
50 * and, it does not hold the term bytes in RAM. Rather, you
51 * must obtain a TermsEnum from the {@link #getOrdTermsEnum}
52 * method, and then seek-by-ord to get the term's bytes.
53 *
54 * While normally term ords are type long, in this API they are
55 * int as the internal representation here cannot address
56 * more than MAX_INT unique terms. Also, typically this
57 * class is used on fields with relatively few unique terms
58 * vs the number of documents. In addition, there is an
59 * internal limit (16 MB) on how many bytes each chunk of
60 * documents may consume. If you trip this limit you'll hit
61 * an IllegalStateException.
62 *
63 * Deleted documents are skipped during uninversion, and if
64 * you look them up you'll get 0 ords.
65 *
66 * The returned per-document ords do not retain their
67 * original order in the document. Instead they are returned
68 * in sorted (by ord, ie term's BytesRef comparator) order. They
69 * are also de-dup'd (ie if doc has same term more than once
70 * in this field, you'll only get that ord back once).
71 *
 * This class
 * will create its own term index internally, allowing it to
 * create a wrapped TermsEnum that can handle ord. The
75 * {@link #getOrdTermsEnum} method then provides this
76 * wrapped enum.
77 *
78 * The RAM consumption of this class can be high!
79 *
80 * @lucene.experimental
81 */
82
83 /*
84 * Final form of the un-inverted field:
85 * Each document points to a list of term numbers that are contained in that document.
86 *
87 * Term numbers are in sorted order, and are encoded as variable-length deltas from the
88 * previous term number. Real term numbers start at 2 since 0 and 1 are reserved. A
89 * term number of 0 signals the end of the termNumber list.
90 *
91 * There is a single int[maxDoc()] which either contains a pointer into a byte[] for
92 * the termNumber lists, or directly contains the termNumber list if it fits in the 4
93 * bytes of an integer. If the first byte in the integer is 1, the next 3 bytes
94 * are a pointer into a byte[] where the termNumber list starts.
95 *
96 * There are actually 256 byte arrays, to compensate for the fact that the pointers
97 * into the byte arrays are only 3 bytes long. The correct byte array for a document
98 * is a function of its id.
99 *
100 * To save space and speed up faceting, any term that matches enough documents will
101 * not be un-inverted... it will be skipped while building the un-inverted field structure,
102 * and will use a set intersection method during faceting.
103 *
104 * To further save memory, the terms (the actual string values) are not all stored in
105 * memory, but a TermIndex is used to convert term numbers to term values only
106 * for the terms needed after faceting has completed. Only every 128th term value
107 * is stored, along with its corresponding term number, and this is used as an
108 * index to find the closest term and iterate until the desired number is hit (very
109 * much like Lucene's own internal term index).
110 *
111 */
112
public class DocTermOrds implements Accountable {

  // Term ords are shifted by this, internally, to reserve
  // values 0 (end term) and 1 (index is a pointer into byte array)
  private final static int TNUM_OFFSET = 2;

  /** Every 128th term is indexed, by default. */
  public final static int DEFAULT_INDEX_INTERVAL_BITS = 7; // decrease to a low number like 2 for testing

  // log2 of the term-index interval (indexInterval == 1 << indexIntervalBits)
  private int indexIntervalBits;
  // indexInterval - 1; used to test (termNum & mask) == 0 cheaply
  private int indexIntervalMask;
  // number of terms between each indexed term (128 by default)
  private int indexInterval;

  /** Don't uninvert terms that exceed this count. */
  protected final int maxTermDocFreq;

  /** Field we are uninverting. */
  protected final String field;

  /** Number of terms in the field. */
  protected int numTermsInField;

  /** Total number of references to term numbers. */
  protected long termInstances;
  // cached result of ramBytesUsed(); 0 means "not yet computed"
  private long memsz;

  /** Total time to uninvert the field. */
  protected int total_time;

  /** Time for phase1 of the uninvert process. */
  protected int phase1_time;

  /** Holds the per-document ords or a pointer to the ords. */
  protected int[] index;

  /** Holds term ords for documents.  Sharded into 256 arrays because the
   *  pointers stored in {@link #index} only have 24 bits available. */
  protected byte[][] tnums = new byte[256][];

  /** Total bytes (sum of term lengths) for all indexed terms.*/
  protected long sizeOfIndexedStrings;

  /** Holds the indexed (by default every 128th) terms. */
  protected BytesRef[] indexedTermsArray = new BytesRef[0];

  /** If non-null, only terms matching this prefix were
   *  indexed. */
  protected BytesRef prefix;

  /** Ordinal of the first term in the field, or 0 if the
   *  {@link PostingsFormat} does not implement {@link
   *  TermsEnum#ord}. */
  protected int ordBase;

  /** Used while uninverting. */
  protected PostingsEnum postingsEnum;
168
169 /** Returns total bytes used. */
170 public long ramBytesUsed() {
171 // can cache the mem size since it shouldn't change
172 if (memsz!=0) return memsz;
173 long sz = 8*8 + 32; // local fields
174 if (index != null) sz += index.length * 4;
175 if (tnums!=null) {
176 for (byte[] arr : tnums)
177 if (arr != null) sz += arr.length;
178 }
179 memsz = sz;
180 return sz;
181 }
182
  @Override
  public Collection<Accountable> getChildResources() {
    // No separately-tracked child accountables; all memory is reported
    // directly by ramBytesUsed().
    return Collections.emptyList();
  }
187
  /** Inverts all terms (no prefix restriction, no docFreq cutoff). */
  public DocTermOrds(LeafReader reader, Bits liveDocs, String field) throws IOException {
    this(reader, liveDocs, field, null, Integer.MAX_VALUE);
  }
192
  // TODO: instead of all these ctors and options, take termsenum!

  /** Inverts only terms starting w/ prefix */
  public DocTermOrds(LeafReader reader, Bits liveDocs, String field, BytesRef termPrefix) throws IOException {
    this(reader, liveDocs, field, termPrefix, Integer.MAX_VALUE);
  }
199
  /** Inverts only terms starting w/ prefix, and only terms
   *  whose docFreq (not taking deletions into account) is
   *  &lt;= maxTermDocFreq */
  public DocTermOrds(LeafReader reader, Bits liveDocs, String field, BytesRef termPrefix, int maxTermDocFreq) throws IOException {
    this(reader, liveDocs, field, termPrefix, maxTermDocFreq, DEFAULT_INDEX_INTERVAL_BITS);
  }
206
  /** Inverts only terms starting w/ prefix, and only terms
   *  whose docFreq (not taking deletions into account) is
   *  &lt;= maxTermDocFreq, with a custom indexing interval
   *  (default is every 128th term). */
  public DocTermOrds(LeafReader reader, Bits liveDocs, String field, BytesRef termPrefix, int maxTermDocFreq, int indexIntervalBits) throws IOException {
    this(field, maxTermDocFreq, indexIntervalBits);
    uninvert(reader, liveDocs, termPrefix);
  }
215
  /** Subclass inits w/ this, but be sure you then call
   *  uninvert, only once */
  protected DocTermOrds(String field, int maxTermDocFreq, int indexIntervalBits) {
    //System.out.println("DTO init field=" + field + " maxTDFreq=" + maxTermDocFreq);
    this.field = field;
    this.maxTermDocFreq = maxTermDocFreq;
    this.indexIntervalBits = indexIntervalBits;
    // mask == indexInterval - 1 for indexIntervalBits in [1,31]
    // (NOTE(review): for indexIntervalBits == 0 the >>> 32 is a no-op in Java,
    // yielding mask 0xffffffff rather than 0 -- callers always pass >= 1)
    indexIntervalMask = 0xffffffff >>> (32-indexIntervalBits);
    indexInterval = 1 << indexIntervalBits;
  }
226
227 /**
228 * Returns a TermsEnum that implements ord, or null if no terms in field.
229 * <p>
230 * we build a "private" terms
231 * index internally (WARNING: consumes RAM) and use that
232 * index to implement ord. This also enables ord on top
233 * of a composite reader. The returned TermsEnum is
234 * unpositioned. This returns null if there are no terms.
235 * </p>
236 * <p><b>NOTE</b>: you must pass the same reader that was
237 * used when creating this class
238 */
239 public TermsEnum getOrdTermsEnum(LeafReader reader) throws IOException {
240 // NOTE: see LUCENE-6529 before attempting to optimize this method to
241 // return a TermsEnum directly from the reader if it already supports ord().
242
243 assert null != indexedTermsArray;
244
245 if (0 == indexedTermsArray.length) {
246 return null;
247 } else {
248 return new OrdWrappedTermsEnum(reader);
249 }
250 }
251
  /**
   * Returns the number of terms in this field.
   * Only meaningful after {@link #uninvert} has completed,
   * which is what assigns {@link #numTermsInField}.
   */
  public int numTerms() {
    return numTermsInField;
  }
258
  /**
   * Returns {@code true} if no terms were indexed.
   */
  public boolean isEmpty() {
    // index is only assigned in uninvert() when at least one term instance
    // was seen, so a null index means nothing was uninverted.
    return index == null;
  }
265
  /** Subclass can override this; invoked once per visited term during
   *  uninversion, with the enum positioned on that term.  No-op by default. */
  protected void visitTerm(TermsEnum te, int termNum) throws IOException {
  }
269
  /** Invoked during {@link #uninvert(org.apache.lucene.index.LeafReader,Bits,BytesRef)}
   *  to record the document frequency for each uninverted
   *  term.  No-op by default; subclasses may record the value. */
  protected void setActualDocFreq(int termNum, int df) throws IOException {
  }
275
  /** Call this only once (if you subclass!) */
  protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (info != null && info.getDocValuesType() != DocValuesType.NONE) {
      // field already has real doc values; uninverting would shadow them
      throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    final long startTime = System.currentTimeMillis();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

    // NOTE(review): liveDocs is never consulted below; deleted documents do
    // not appear to be skipped during uninversion in this method -- confirm
    // whether the postings enum filters deletions before relying on the
    // class javadoc's claim.

    final int maxDoc = reader.maxDoc();
    final int[] index = new int[maxDoc];       // immediate term numbers, or the index into the byte[] representing the last number
    final int[] lastTerm = new int[maxDoc];    // last term we saw for this document
    final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    final Terms terms = reader.terms(field);
    if (terms == null) {
      // No terms
      return;
    }

    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    //System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
      // No terms match
      return;
    }

    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    postingsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (;;) {
      final BytesRef t = te.term();
      if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
        break;
      }
      //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

      visitTerm(te, termNum);

      if ((termNum & indexIntervalMask) == 0) {
        // Index this term (every indexInterval-th term is kept for seeking)
        sizeOfIndexedStrings += t.length;
        BytesRef indexedTerm = new BytesRef();
        indexedTermsBytes.copy(t, indexedTerm);
        // TODO: really should 1) strip off useless suffix,
        // and 2) use FST not array/PagedBytes
        indexedTerms.add(indexedTerm);
      }

      final int df = te.docFreq();
      if (df <= maxTermDocFreq) {
        // uninvert this term's postings into the per-document delta lists;
        // terms that are too frequent are intentionally skipped (see class docs)

        postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

        // dF, but takes deletions into account
        int actualDF = 0;

        for (;;) {
          int doc = postingsEnum.nextDoc();
          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
            break;
          }
          //System.out.println("  chunk=" + chunk + " docs");

          actualDF ++;
          termInstances++;

          //System.out.println("    docID=" + doc);
          // add TNUM_OFFSET to the term number to make room for special reserved values:
          // 0 (end term) and 1 (index into byte array follows)
          int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
          lastTerm[doc] = termNum;
          int val = index[doc];

          if ((val & 0xff)==1) {
            // index into byte array (actually the end of
            // the doc-specific byte[] when building)
            int pos = val >>> 8;
            int ilen = vIntSize(delta);
            byte[] arr = bytes[doc];
            int newend = pos+ilen;
            if (newend > arr.length) {
              // We avoid a doubling strategy to lower memory usage.
              // this faceting method isn't for docs with many terms.
              // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
              // TODO: figure out what array lengths we can round up to w/o actually using more memory
              // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
              // It should be safe to round up to the nearest 32 bits in any case.
              int newLen = (newend + 3) & 0xfffffffc;  // 4 byte alignment
              byte[] newarr = new byte[newLen];
              System.arraycopy(arr, 0, newarr, 0, pos);
              arr = newarr;
              bytes[doc] = newarr;
            }
            pos = writeInt(delta, arr, pos);
            index[doc] = (pos<<8) | 1;  // update pointer to end index in byte[]
          } else {
            // OK, this int has data in it... find the end (a zero starting byte - not
            // part of another number, hence not following a byte with the high bit set).
            int ipos;
            if (val==0) {
              ipos=0;
            } else if ((val & 0x0000ff80)==0) {
              ipos=1;
            } else if ((val & 0x00ff8000)==0) {
              ipos=2;
            } else if ((val & 0xff800000)==0) {
              ipos=3;
            } else {
              ipos=4;
            }

            //System.out.println("      ipos=" + ipos);

            // tentatively append the new delta after the existing inline bytes
            int endPos = writeInt(delta, tempArr, ipos);
            //System.out.println("      endpos=" + endPos);
            if (endPos <= 4) {
              //System.out.println("      fits!");
              // value will fit in the integer... move bytes back
              for (int j=ipos; j<endPos; j++) {
                val |= (tempArr[j] & 0xff) << (j<<3);
              }
              index[doc] = val;
            } else {
              // value won't fit... move integer into byte[]
              for (int j=0; j<ipos; j++) {
                tempArr[j] = (byte)val;
                val >>>=8;
              }
              // point at the end index in the byte[]
              index[doc] = (endPos<<8) | 1;
              bytes[doc] = tempArr;
              tempArr = new byte[12];
            }
          }
        }
        setActualDocFreq(termNum, actualDF);
      }

      termNum++;
      if (te.next() == null) {
        break;
      }
    }

    numTermsInField = termNum;

    long midPoint = System.currentTimeMillis();

    if (termInstances == 0) {
      // we didn't invert anything
      // lower memory consumption.
      tnums = null;
    } else {

      this.index = index;

      //
      // transform intermediate form into the final form, building a single byte[]
      // at a time, and releasing the intermediate byte[]s as we go to avoid
      // increasing the memory footprint.
      //

      for (int pass = 0; pass<256; pass++) {
        byte[] target = tnums[pass];
        int pos=0;  // end in target;
        if (target != null) {
          pos = target.length;
        } else {
          target = new byte[4096];
        }

        // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
        // where pp is the pass (which array we are building), and xx is all values.
        // each pass shares the same byte[] for termNumber lists.
        for (int docbase = pass<<16; docbase<maxDoc; docbase+=(1<<24)) {
          int lim = Math.min(docbase + (1<<16), maxDoc);
          for (int doc=docbase; doc<lim; doc++) {
            //System.out.println("  pass=" + pass + " process docID=" + doc);
            int val = index[doc];
            if ((val&0xff) == 1) {
              int len = val >>> 8;
              //System.out.println("    ptr pos=" + pos);
              index[doc] = (pos<<8)|1; // change index to point to start of array
              if ((pos & 0xff000000) != 0) {
                // we only have 24 bits for the array index
                throw new IllegalStateException("Too many values for UnInvertedField faceting on field "+field);
              }
              byte[] arr = bytes[doc];
              /*
              for(byte b : arr) {
                //System.out.println("      b=" + Integer.toHexString((int) b));
              }
              */
              bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
              if (target.length <= pos + len) {
                int newlen = target.length;
                /*** we don't have to worry about the array getting too large
                 * since the "pos" param will overflow first (only 24 bits available)
                if ((newlen<<1) <= 0) {
                  // overflow...
                  newlen = Integer.MAX_VALUE;
                  if (newlen <= pos + len) {
                    throw new SolrException(400,"Too many terms to uninvert field!");
                  }
                } else {
                  while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                }
                ****/
                while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                byte[] newtarget = new byte[newlen];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
              }
              System.arraycopy(arr, 0, target, pos, len);
              pos += len + 1; // skip single byte at end and leave it 0 for terminator
            }
          }
        }

        // shrink array
        if (pos < target.length) {
          byte[] newtarget = new byte[pos];
          System.arraycopy(target, 0, newtarget, 0, pos);
          target = newtarget;
        }

        tnums[pass] = target;

        if ((pass << 16) > maxDoc)
          break;
      }

    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

    long endTime = System.currentTimeMillis();

    total_time = (int)(endTime-startTime);
    phase1_time = (int)(midPoint-startTime);
  }
545
546 /** Number of bytes to represent an unsigned int as a vint. */
547 private static int vIntSize(int x) {
548 if ((x & (0xffffffff << (7*1))) == 0 ) {
549 return 1;
550 }
551 if ((x & (0xffffffff << (7*2))) == 0 ) {
552 return 2;
553 }
554 if ((x & (0xffffffff << (7*3))) == 0 ) {
555 return 3;
556 }
557 if ((x & (0xffffffff << (7*4))) == 0 ) {
558 return 4;
559 }
560 return 5;
561 }
562
563 // todo: if we know the size of the vInt already, we could do
564 // a single switch on the size
565 private static int writeInt(int x, byte[] arr, int pos) {
566 int a;
567 a = (x >>> (7*4));
568 if (a != 0) {
569 arr[pos++] = (byte)(a | 0x80);
570 }
571 a = (x >>> (7*3));
572 if (a != 0) {
573 arr[pos++] = (byte)(a | 0x80);
574 }
575 a = (x >>> (7*2));
576 if (a != 0) {
577 arr[pos++] = (byte)(a | 0x80);
578 }
579 a = (x >>> (7*1));
580 if (a != 0) {
581 arr[pos++] = (byte)(a | 0x80);
582 }
583 arr[pos++] = (byte)(x & 0x7f);
584 return pos;
585 }
586
  /**
   * "wrap" our own terms index around the original IndexReader.
   * Only valid if there are terms for this field from the original reader.
   * Tracks the current ordinal alongside the delegate enum and uses the
   * in-memory indexedTermsArray (every indexInterval-th term) to seek.
   */
  private final class OrdWrappedTermsEnum extends TermsEnum {
    private final TermsEnum termsEnum;
    private BytesRef term;
    private long ord = -indexInterval-1;          // force "real" seek

    public OrdWrappedTermsEnum(LeafReader reader) throws IOException {
      assert indexedTermsArray != null;
      assert 0 != indexedTermsArray.length;
      termsEnum = reader.fields().terms(field).iterator();
    }

    @Override
    public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
      return termsEnum.postings(reuse, flags);
    }

    @Override
    public BytesRef term() {
      return term;
    }

    @Override
    public BytesRef next() throws IOException {
      if (++ord < 0) {
        ord = 0;
      }
      if (termsEnum.next() == null) {
        term = null;
        return null;
      }
      return setTerm();  // this is extra work if we know we are in bounds...
    }

    @Override
    public int docFreq() throws IOException {
      return termsEnum.docFreq();
    }

    @Override
    public long totalTermFreq() throws IOException {
      return termsEnum.totalTermFreq();
    }

    @Override
    public long ord() {
      return ordBase + ord;
    }

    @Override
    public SeekStatus seekCeil(BytesRef target) throws IOException {

      // already here
      if (term != null && term.equals(target)) {
        return SeekStatus.FOUND;
      }

      // binary-search the sparse in-memory index of every indexInterval-th term
      int startIdx = Arrays.binarySearch(indexedTermsArray, target);

      if (startIdx >= 0) {
        // we hit the term exactly... lucky us!
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
        ord = startIdx << indexIntervalBits;
        setTerm();
        assert term != null;
        return SeekStatus.FOUND;
      }

      // we didn't hit the term exactly
      startIdx = -startIdx-1;

      if (startIdx == 0) {
        // our target occurs *before* the first term
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
        assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
        ord = 0;
        setTerm();
        assert term != null;
        return SeekStatus.NOT_FOUND;
      }

      // back up to the start of the block
      startIdx--;

      if ((ord >> indexIntervalBits) == startIdx && term != null && term.compareTo(target) <= 0) {
        // we are already in the right block and the current term is before the term we want,
        // so we don't need to seek.
      } else {
        // seek to the right block
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(indexedTermsArray[startIdx]);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
        ord = startIdx << indexIntervalBits;
        setTerm();
        assert term != null;  // should be non-null since it's in the index
      }

      // linear scan within the block (at most indexInterval steps) until we
      // reach or pass the target
      while (term != null && term.compareTo(target) < 0) {
        next();
      }

      if (term == null) {
        return SeekStatus.END;
      } else if (term.compareTo(target) == 0) {
        return SeekStatus.FOUND;
      } else {
        return SeekStatus.NOT_FOUND;
      }
    }

    @Override
    public void seekExact(long targetOrd) throws IOException {
      int delta = (int) (targetOrd - ordBase - ord);
      //System.out.println("  seek(ord) targetOrd=" + targetOrd + " delta=" + delta + " ord=" + ord + " ii=" + indexInterval);
      if (delta < 0 || delta > indexInterval) {
        // target is outside the current block: jump to the indexed term at or
        // before targetOrd, then scan forward the remaining distance
        final int idx = (int) (targetOrd >>> indexIntervalBits);
        final BytesRef base = indexedTermsArray[idx];
        //System.out.println("  do seek term=" + base.utf8ToString());
        ord = idx << indexIntervalBits;
        delta = (int) (targetOrd - ord);
        final TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(base);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
      } else {
        //System.out.println("seek w/in block");
      }

      while (--delta >= 0) {
        BytesRef br = termsEnum.next();
        if (br == null) {
          assert false;
          return;
        }
        ord++;
      }

      setTerm();
      assert term != null;
    }

    private BytesRef setTerm() throws IOException {
      term = termsEnum.term();
      //System.out.println("  setTerm() term=" + term.utf8ToString() + " vs prefix=" + (prefix == null ? "null" : prefix.utf8ToString()));
      if (prefix != null && !StringHelper.startsWith(term, prefix)) {
        // past the prefix-restricted range: report exhaustion
        term = null;
      }
      return term;
    }
  }
738
  /** Returns the term ({@link BytesRef}) corresponding to
   *  the provided ordinal.  The passed enum must support ord-based
   *  seeking (e.g. the one returned by {@link #getOrdTermsEnum}). */
  public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException {
    termsEnum.seekExact(ord);
    return termsEnum.term();
  }
745
746 /** Returns a SortedSetDocValues view of this instance */
747 public SortedSetDocValues iterator(LeafReader reader) throws IOException {
748 if (isEmpty()) {
749 return DocValues.emptySortedSet();
750 } else {
751 return new Iterator(reader);
752 }
753 }
754
  /** SortedSetDocValues view that decodes the packed per-document
   *  term-number deltas produced by uninvert(). */
  private class Iterator extends SortedSetDocValues {
    final LeafReader reader;
    final TermsEnum te;  // used internally for lookupOrd() and lookupTerm()
    // currently we read 5 at a time (using the logic of the old iterator)
    final int buffer[] = new int[5];
    int bufferUpto;
    int bufferLength;

    private int tnum;   // last decoded term number (delta-decoding state)
    private int upto;   // inlined code bits, or read position within arr
    private byte[] arr; // byte[] holding this doc's vInt deltas, or null if inlined

    Iterator(LeafReader reader) throws IOException {
      this.reader = reader;
      this.te = termsEnum();
    }

    @Override
    public long nextOrd() {
      while (bufferUpto == bufferLength) {
        if (bufferLength < buffer.length) {
          // the last read() returned fewer than a full buffer: exhausted
          return NO_MORE_ORDS;
        } else {
          bufferLength = read(buffer);
          bufferUpto = 0;
        }
      }
      return buffer[bufferUpto++];
    }

    /** Buffer must be at least 5 ints long.  Returns number
     *  of term ords placed into buffer; if this count is
     *  less than buffer.length then that is the end. */
    int read(int[] buffer) {
      int bufferUpto = 0;
      if (arr == null) {
        // code is inlined into upto
        //System.out.println("inlined");
        int code = upto;
        int delta = 0;
        for (;;) {
          delta = (delta << 7) | (code & 0x7f);
          if ((code & 0x80)==0) {
            if (delta==0) break;        // zero delta terminates the list
            tnum += delta - TNUM_OFFSET;
            buffer[bufferUpto++] = ordBase+tnum;
            //System.out.println("  tnum=" + tnum);
            delta = 0;
          }
          code >>>= 8;
        }
      } else {
        // code is a pointer
        for(;;) {
          int delta = 0;
          for(;;) {
            byte b = arr[upto++];
            delta = (delta << 7) | (b & 0x7f);
            //System.out.println("    cycle: upto=" + upto + " delta=" + delta + " b=" + b);
            if ((b & 0x80) == 0) break; // high bit clear = last byte of this vInt
          }
          //System.out.println("  delta=" + delta);
          if (delta == 0) break;        // zero delta terminates the list
          tnum += delta - TNUM_OFFSET;
          //System.out.println("  tnum=" + tnum);
          buffer[bufferUpto++] = ordBase+tnum;
          if (bufferUpto == buffer.length) {
            break;
          }
        }
      }

      return bufferUpto;
    }

    @Override
    public void setDocument(int docID) {
      tnum = 0;
      final int code = index[docID];
      if ((code & 0xff)==1) {
        // a pointer
        upto = code>>>8;
        //System.out.println("    pointer!  upto=" + upto);
        int whichArray = (docID >>> 16) & 0xff;  // tnums is sharded by doc id
        arr = tnums[whichArray];
      } else {
        //System.out.println("    inline!");
        arr = null;
        upto = code;
      }
      bufferUpto = 0;
      bufferLength = read(buffer);
    }

    @Override
    public BytesRef lookupOrd(long ord) {
      try {
        return DocTermOrds.this.lookupTerm(te, (int) ord);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    @Override
    public long getValueCount() {
      return numTerms();
    }

    @Override
    public long lookupTerm(BytesRef key) {
      try {
        switch (te.seekCeil(key)) {
          case FOUND:
            assert te.ord() >= 0;
            return te.ord();
          case NOT_FOUND:
            assert te.ord() >= 0;
            return -te.ord()-1;
          default: /* END */
            return -numTerms()-1;
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    @Override
    public TermsEnum termsEnum() {
      try {
        return getOrdTermsEnum(reader);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }
890 }